Instruction

Starting from importing packages until data cleaning is essential. Remaining code blocks are independent of each other, can be ran individually.

Import Packages

Specify Paths

# inputs
dataset_path <- "./Data/"
function_path <- "./Functions/"

# outputs
itrt_plot_path <- "./Output/InteractivePlot/"
sttc_plot_path <- "./Output/StaticPlot/"
out_data_path <- "./Output/Data/"

Import Dataset

daily_covid <- import(
    paste0(dataset_path, "worldometer_coronavirus_daily_data.csv")
    )

summary_covid <- import(
    paste0(dataset_path, "worldometer_coronavirus_summary_data.csv")
    )

head(daily_covid)
##         date     country cumulative_total_cases daily_new_cases active_cases
## 1 2020-02-15 Afghanistan                      0              NA            0
## 2 2020-02-16 Afghanistan                      0              NA            0
## 3 2020-02-17 Afghanistan                      0              NA            0
## 4 2020-02-18 Afghanistan                      0              NA            0
## 5 2020-02-19 Afghanistan                      0              NA            0
## 6 2020-02-20 Afghanistan                      0              NA            0
##   cumulative_total_deaths daily_new_deaths
## 1                       0               NA
## 2                       0               NA
## 3                       0               NA
## 4                       0               NA
## 5                       0               NA
## 6                       0               NA
head(summary_covid)
##       country     continent total_confirmed total_deaths total_recovered
## 1 Afghanistan          Asia          158275         7367          145750
## 2     Albania        Europe          213257         3228          202077
## 3     Algeria        Africa          220415         6310          151347
## 4     Andorra        Europe           25289          141           21511
## 5      Angola        Africa           86636         1789           67477
## 6    Anguilla North America            1777            6            1702
##   active_cases serious_or_critical total_cases_per_1m_population
## 1         5158                1124                          3932
## 2         7952                  23                         74227
## 3        62758                  34                          4893
## 4         3637                  31                        326512
## 5        17370                   7                          2518
## 6           69                  NA                        116869
##   total_deaths_per_1m_population total_tests total_tests_per_1m_population
## 1                            183      826810                         20541
## 2                           1124     1495002                        520354
## 3                            140      230861                          5125
## 4                           1820      249838                       3225714
## 5                             52     1296669                         37686
## 6                            395       51382                       3379283
##   population
## 1   40250878
## 2    2873049
## 3   45046063
## 4      77452
## 5   34407243
## 6      15205

Data Cleaning

daily_covid <-
    daily_covid %>%
    replace(is.na(.), 0) %>% 
    mutate(date = as.Date(date))

Data Manipulation and visualization

Generic Questions and Answers

Question 1

What is the overview of covid cases?

## global percentage of death, active case and recovered ##

# sum vertically
categories <- c("total_deaths", "total_recovered", "active_cases")
category <- str_replace_all(categories, pattern =  "_", replacement = " ")
category <- str_to_title(category)

data <- 
    summary_covid[, categories] %>%
    colSums(na.rm = T)
data <- data.frame(
  category=category,
  count=data
)

data$prettyCount <- prettyNum(data$count, big.mark = ",", scientific = F)

# Compute percentages
data$fraction <- data$count / sum(data$count)

# Compute the cumulative percentages (top of each rectangle)
data$ymax <- cumsum(data$fraction)

# Compute the bottom of each rectangle
data$ymin <- c(0, head(data$ymax, n=-1))

# Compute label position
data$labelPosition <- (data$ymax + data$ymin) / 2

# Compute display percentages
data$prettyFraction <- percent(data$fraction)

# Make the plot
q1 <- 
    ggplot(data, aes(ymax=ymax, ymin=ymin, xmax=4, xmin=3, fill=category)) +
    geom_rect() +
    geom_text( 
        x=4.3, 
        aes(y=labelPosition, label=prettyCount, color=category, fontface="bold"), 
        size=3.5
    ) + # x here controls label position (inner / outer)
    geom_text(
        x=3.5, 
        aes(y=labelPosition, label=prettyFraction, fontface="bold"), 
        color="white",
        size=4
    ) +
    scale_fill_brewer(palette="Set2") +
    scale_color_brewer(palette="Set2") +
    coord_polar(theta="y") +
    xlim(c(2, 4)) +
    theme_void() +
    annotate(
        geom = "text", 
        x = 2, 
        y = 0, 
        colour = "#eba834",
        label = paste0(
            "Total Cases\n", 
            prettyNum(sum(data$count), big.mark = ",", scientific = F)
        )
    )

ggsave(paste0(sttc_plot_path, "qa1_cases_proportion.png"))
## Saving 7 x 5 in image
q1

Question 2

What is the scale of infected population in different continent or country?

## comparison of cases between different continent ##

q2 <- 
    summary_covid %>% # data
    select(country:active_cases) %>%
    group_by(continent) %>% # group_by 
    filter(total_confirmed > quantile(total_confirmed, 0.7)) %>% # removing small cases
    ungroup() %>% 
    group_by(continent, country) %>% 
    # turning 3 columes into sub sub group (wide to long conversion)
    gather(category, count, total_recovered, active_cases, total_deaths, factor_key=T) %>% 
    ungroup() %>% 
    mutate(category = factor(category, labels = c("Recovered", "Active Cases", "Deaths"))) %>% 
    treemap( index=c("continent","country","category"),
             vSize="count",
             type="index",
             palette = "Set2",
             title = "Group by continent top 70 percentile confirmed cases",
             align.labels=list(
                 c("center", "center"),
                 c("left", "top"),
                 c("left", "bottom")
             )
        )

itrt_q2 <- d3tree2( q2 ,  rootname = "Group by continent top 70 percentile confirmed cases" )
saveWidget(itrt_q2, file = paste0(itrt_plot_path, "qa2_infection_scale.html"))

itrt_q2

Question 3

How many people suffered from covid?

# overview of accumulated cases vs date for all the country

# global stacked area plot
data <- 
    
    # group by summation
    daily_covid %>% 
    group_by(date) %>% 
    summarise(
        cumulative_total_cases = sum(cumulative_total_cases, na.rm = T),
        cumulative_total_deaths = sum(cumulative_total_deaths, na.rm = T),
    ) %>% 
    
    # convert wide columns to long rows
    gather(categories, count,
           cumulative_total_cases, cumulative_total_deaths) %>% 
    
    # 
    rowwise() %>% 
    mutate(text = 
               paste(
                   str_to_title(last(strsplit(categories, "_")[[1]])),
                   "Count:", comma(count),
                   "\nDate:", as.Date(date, format = "%d %b %Y")
               )
    ) %>% 
    arrange(date) # this is just to check if text is appended properly

write.csv(data, paste0(out_data_path, "qa3_cumulative_cases_by_date.csv"))

facet_labels <- c(
    'cumulative_total_cases'="Cumulative Cases",
    'cumulative_total_deaths'="Cumulative Deaths"
)

q3 <-          # I can't use any function in the text argument \|/
    ggplot(data, aes(x=date, y=count, group=categories, fill=categories, text = text)) +
    geom_area(alpha=0.8 , size=0.5, color="black") +
    facet_wrap(~categories, scales = "free_y",  labeller = as_labeller(facet_labels)) +
    scale_fill_viridis(discrete = T, option="B", begin = 0.3, end = 0.7) +
    scale_x_date(date_labels = "%b %Y") +
    scale_y_continuous(labels = unit_format(unit = "M", scale = 1e-6)) +
    theme(legend.position="none") +
    ggtitle("Cumulative Covid Cases") +
    ylab("Covid Cases") +
    xlab("Date") +
    theme_ipsum() +
    theme(legend.position="none", axis.text.x = element_text(angle=45, hjust = 1))

ggsave(paste0(sttc_plot_path, "qa3_cumulative_cases_by_date.png"))
## Saving 7 x 5 in image
itrt_q3 <- ggplotly(q3, tooltip = "text")
saveWidget(itrt_q3, file = paste0(itrt_plot_path, "qa3_cumulative_cases_by_date.html"))

# q3
itrt_q3

Question 4

Which country has the most cases?

# ranking of cases for the top n countries
# hist

Question 5

How did the number of active cases evolve starting from the beginning?

# ALERT !!! THIS SECTION CREATES GIF PLOTS, IT REQUIRE ROUGHLY 5 MINUTES TO RUN
# This block of code is disabled, remove eval=F to evaluate
# 100 MB of images will be generated during the process
# The final GIF is around 20MD with a resolution of 1920 x 1065, 8 fps

# An extra standalone R script (./qa5.R) is available for this section
# In case that it doesn't work in Rmarkdown

# map, gif encoder, list sorting
pacman::p_load(maps,gifski,gtools)

# map boarder data
world <- map_data("world")

# loop config
date_seq <- seq(as.Date(min(daily_covid$date)), 
                as.Date(max(daily_covid$date)), "days")
print_frequrency <- 50
len_date_seq <- length(date_seq)

max_active_cases <- max(daily_covid$active_cases)
min_active_cases <- min(daily_covid$active_cases)

# generate active cases world map images
for (i in c(1:len_date_seq)) {
    
    current_date <- as.Date(date_seq[i])
    date_daily_covid <- filter(daily_covid, date == current_date)
    mapdata <- left_join(world, date_daily_covid, by= c("region" = "country"))
    
    map <-
        ggplot(mapdata, aes(x=long, y=lat, group=group)) +
        geom_polygon(
            aes(fill = active_cases), 
            color="black",
            size= 0.2
        ) +
        scale_fill_distiller(
            name="Active Cases", 
            palette = "Spectral",
            na.value = "grey50",
            trans = "log10",
            limits= c(min_active_cases, max_active_cases)
        ) +
        ggtitle(paste0("Date: ", current_date)) +
        xlab(element_blank()) +
        ylab(element_blank()) +
        guides(fill = guide_colourbar(
            barwidth = 0.5, 
            barheight = 10,
            ticks = F
        )) +
        theme(
            plot.title = element_text(size=12),
            panel.background = element_rect(
                colour = "black", 
                fill = "white",
                size = 0.2
            ),
            axis.text.x = element_blank(),
            axis.text.y = element_blank(),
            axis.ticks = element_blank(),
            panel.grid.major.x = element_blank(),
            panel.grid.minor.x = element_blank(),
            panel.grid.major.y = element_blank(),
            panel.grid.minor.y = element_blank()
        )
    
    # save plot
    suppressMessages(ggsave(
        plot = map,
        filename = paste0(itrt_plot_path, "/qa5_map/", i,".png")
    ))
    
    # print log
    if ((i %% print_frequrency) == 0) {
        print(paste(i, "/", len_date_seq))
    }
    
}

# load png paths and convert it into gif
png_files <- list.files(paste0(itrt_plot_path, "qa5_map/"), 
                        pattern = ".*png$", full.names = TRUE)
png_files <- mixedsort(sort(png_files))
gifski(png_files, gif_file = paste0(itrt_plot_path, "qa5_map_08fps.gif"), 
       width = 1920, height = 1065, delay = 0.125)

pacman::p_unload(maps,gifski,gtools)

Generated Images
715 images located in ./Output/InteractivePlot/qa5_map/

Generated GIF

More in depth Questions

Question 1

We can easily tell from previous plots that most of the cases are from big countries. Is there any relation between population and Covid cases?

# we can easily tell from previous plots that most of the cases are from big
# countries. Now, I'm curious about the relation between population & Covid cases

q11 <- summary_covid %>%
    
    # Reorder countries to having big bubbles at the back
    arrange(desc(population)) %>%
    
    # prepare text for tooltip
    mutate(text = 
               paste0(
                   "Country: ", country, 
                   "\nPopulation: ", comma(population), 
                   "\nTotal Cases:\t", comma(total_confirmed), 
                   "\nTotal Tests\t", comma(total_tests)
               )
           ) %>%
    
    ggplot( 
        aes(
            x = population, 
            y = total_confirmed, 
            fill = continent, 
            size = total_tests, 
            text = text
        )
    ) +
    geom_point(alpha=0.5, color = "black", shape = 21, na.rm = T) +
    scale_x_log10(
        labels = unit_format(unit = "M", scale = 1e-6),
        breaks = 1e+3 * 10^(seq(0,20,2)),
    ) +
    scale_y_log10(
        labels = unit_format(unit = "M", scale = 1e-6),
        breaks = 10^(seq(1,21,2)),
    ) +
    scale_size(range = c(2, 25), name="Total Tests (M), Size") +
    labs(fill = 'Continent, Color') +
    scale_fill_viridis(discrete=T, option = "D") +
    coord_cartesian(clip = "off") +
    ylab("Covid Cases (M), log10(n)") +
    xlab("Population (M), log10(n)") +
    theme_bw()
ggsave(paste0(sttc_plot_path, "qb1_infection_population.png"))
## Saving 12 x 6 in image
# turn interactive ggplot with plotly and save it
itrt_q11 <- ggplotly(q11, tooltip="text")
saveWidget(itrt_q11, file = paste0(itrt_plot_path, "qb1_infection_population.html"))

q11

itrt_q11

Question 2

Which country did well in this pandemic war?

Question 3

can weather, geometric position affect the spread of covid?

Question 4

an even more in depth question, we’ve seen that population definitely affect how covid spread, let’s check and see why social distancing is needed.

Question 5

How did the R rate behave starting from the beginning of covid?

clean up

# pacman::p_unload(all)